In [1]:
#Importing all required library
import nltk
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import word_tokenize, sent_tokenize
from wordcloud import WordCloud, STOPWORDS
In [2]:
#Downloading nltk StopWords and Wordnet 
nltk.download('stopwords')
nltk.download('wordnet')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Out[2]:
True
In [5]:
from google.colab import drive

drive.mount('/content/gdrive')
Mounted at /content/gdrive

Dataset 1

In [7]:
d1 = pd.read_csv("/content/gdrive/MyDrive/DataSets/news.csv")
In [8]:
d1.head()
Out[8]:
Unnamed: 0 title text label
0 8476 You Can Smell Hillary’s Fear Daniel Greenfield, a Shillman Journalism Fello... FAKE
1 10294 Watch The Exact Moment Paul Ryan Committed Pol... Google Pinterest Digg Linkedin Reddit Stumbleu... FAKE
2 3608 Kerry to go to Paris in gesture of sympathy U.S. Secretary of State John F. Kerry said Mon... REAL
3 10142 Bernie supporters on Twitter erupt in anger ag... — Kaydee King (@KaydeeKing) November 9, 2016 T... FAKE
4 875 The Battle of New York: Why This Primary Matters It's primary day in New York and front-runners... REAL
In [9]:
d1["Article"] = d1["title"] + d1["text"]
d1.sample(frac = 1) #Shuffle 100%

d1.label[d1.label == 'REAL'] = 1
d1.label[d1.label == 'FAKE'] = 0

d1 = d1.loc[:,['Article','label']]
d1 = d1.dropna()
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
In [10]:
d1.head()
Out[10]:
Article label
0 You Can Smell Hillary’s FearDaniel Greenfield,... 0
1 Watch The Exact Moment Paul Ryan Committed Pol... 0
2 Kerry to go to Paris in gesture of sympathyU.S... 1
3 Bernie supporters on Twitter erupt in anger ag... 0
4 The Battle of New York: Why This Primary Matte... 1
In [11]:
def wordpre(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text
In [12]:
d1['Article']=d1['Article'].apply(wordpre)
In [ ]:
#word used in Real news 
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(d1[d1.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0xddad550>
In [ ]:
#word used in Fake news 
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(d1[d1.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0x635d910>

Dataset2

In [13]:
dF2 = pd.read_csv("/content/gdrive/MyDrive/DataSets/Fake.csv")
dT2 = pd.read_csv("/content/gdrive/MyDrive/DataSets/True.csv")
In [14]:
#Counting by Subjects in Real news
for key,count in dT2.subject.value_counts().iteritems():
    print(f"{key}:\t{count}")
    
#Getting Total Rows
print(f"Total Records:\t{dT2.shape[0]}")
politicsNews:	11272
worldnews:	10145
Total Records:	21417
In [15]:
#Counting by Subjects in Fake news
for key,count in dF2.subject.value_counts().iteritems():
    print(f"{key}:\t{count}")
    
#Getting Total Rows
print(f"Total Records:\t{dF2.shape[0]}")
News:	9050
politics:	6841
left-news:	4459
Government News:	1570
US_News:	783
Middle-east:	778
Total Records:	23481
In [16]:
#ploting the Subjects in Real news
plt.figure(figsize=(8,5))
sns.countplot("subject", data=dT2)
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
In [ ]:
#ploting the Subjects in Fake news
plt.figure(figsize=(8,5))
sns.countplot("subject", data=dF2)
plt.show()
/usr/local/lib/python3.6/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
In [17]:
dT2['label']= 1
dF2['label']= 0
d2 = pd.concat([dT2, dF2])
d2["Article"] = d2["title"] + d2["text"]
d2.sample(frac = 1) #Shuffle 100%
d2 = d2.loc[:,['Article','label']]
In [18]:
d2
Out[18]:
Article label
0 As U.S. budget fight looms, Republicans flip t... 1
1 U.S. military to accept transgender recruits o... 1
2 Senior U.S. Republican senator: 'Let Mr. Muell... 1
3 FBI Russia probe helped by Australian diplomat... 1
4 Trump wants Postal Service to charge 'much mor... 1
... ... ...
23476 McPain: John McCain Furious That Iran Treated ... 0
23477 JUSTICE? Yahoo Settles E-mail Privacy Class-ac... 0
23478 Sunnistan: US and Allied ‘Safe Zone’ Plan to T... 0
23479 How to Blow $700 Million: Al Jazeera America F... 0
23480 10 U.S. Navy Sailors Held by Iranian Military ... 0

44898 rows × 2 columns

In [19]:
##  Applying the wordpre method to the dataset
d2['Article']=d2['Article'].apply(wordpre)
In [ ]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(d2[d2.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0x28429550>
In [ ]:
#word used in Fake news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(d2[d2.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0x2a291700>

Dataset 3

In [20]:
dR3 = pd.read_csv("/content/gdrive/MyDrive/DataSets/politifact_real.csv")
dF3 = pd.read_csv("/content/gdrive/MyDrive/DataSets/politifact_fake.csv")
In [21]:
dR3['label']= 1
dF3['label']= 0
df3 = pd.concat([dR3, dF3])
df3["Article"] = df3["title"]
df3.sample(frac = 1) #Shuffle 100%
df3 = df3.loc[:,['Article','label']]
In [22]:
df3
Out[22]:
Article label
0 National Federation of Independent Business 1
1 comments in Fayetteville NC 1
2 Romney makes pitch, hoping to close deal : Ele... 1
3 Democratic Leaders Say House Democrats Are Uni... 1
4 Budget of the United States Government, FY 2008 1
... ... ...
427 Who is affected by the government shutdown? 0
428 Lindsey Graham Threatens To Convert To Democra... 0
429 ELECTORAL COLLEGE ELECTOR COMMITS SUICIDE TO A... 0
430 Sarah Palin Calls To Boycott Mall Of America B... 0
431 Account Suspended 0

1056 rows × 2 columns

In [23]:
df3['Article']=df3['Article'].apply(wordpre)
In [ ]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(df3[df3.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0xdd22730>
In [ ]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(df3[df3.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0x284fdf10>

Dataset-4

In [24]:
df4= pd.read_csv("/content/gdrive/MyDrive/DataSets/train.csv")
In [25]:
df4.head()
Out[25]:
id title author text label
0 0 House Dem Aide: We Didn’t Even See Comey’s Let... Darrell Lucus House Dem Aide: We Didn’t Even See Comey’s Let... 1
1 1 FLYNN: Hillary Clinton, Big Woman on Campus - ... Daniel J. Flynn Ever get the feeling your life circles the rou... 0
2 2 Why the Truth Might Get You Fired Consortiumnews.com Why the Truth Might Get You Fired October 29, ... 1
3 3 15 Civilians Killed In Single US Airstrike Hav... Jessica Purkiss Videos 15 Civilians Killed In Single US Airstr... 1
4 4 Iranian woman jailed for fictional unpublished... Howard Portnoy Print \nAn Iranian woman has been sentenced to... 1
In [26]:
df4["Article"] = df4["title"] + df4["text"]
df4.sample(frac = 1) #Shuffle 100%

df4 = df4.loc[:,['Article','label']]
df4 = df4.dropna()
In [27]:
df4['Article']=df4['Article'].apply(wordpre)
In [ ]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(df4[df4.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0x6837ad90>
In [ ]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(df4[df4.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0x6bf15580>

Dataset -5

In [28]:
d5= pd.read_csv("/content/gdrive/MyDrive/DataSets/data.csv")
In [29]:
d5
Out[29]:
URLs Headline Body Label
0 http://www.bbc.com/news/world-us-canada-414191... Four ways Bob Corker skewered Donald Trump Image copyright Getty Images\nOn Sunday mornin... 1
1 https://www.reuters.com/article/us-filmfestiva... Linklater's war veteran comedy speaks to moder... LONDON (Reuters) - “Last Flag Flying”, a comed... 1
2 https://www.nytimes.com/2017/10/09/us/politics... Trump’s Fight With Corker Jeopardizes His Legi... The feud broke into public view last week when... 1
3 https://www.reuters.com/article/us-mexico-oil-... Egypt's Cheiron wins tie-up with Pemex for Mex... MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin... 1
4 http://www.cnn.com/videos/cnnmoney/2017/10/08/... Jason Aldean opens 'SNL' with Vegas tribute Country singer Jason Aldean, who was performin... 1
... ... ... ... ...
4004 http://beforeitsnews.com/sports/2017/09/trends... Trends to Watch Trends to Watch\n% of readers think this story... 0
4005 http://beforeitsnews.com/u-s-politics/2017/10/... Trump Jr. Is Soon To Give A 30-Minute Speech F... Trump Jr. Is Soon To Give A 30-Minute Speech F... 0
4006 https://www.activistpost.com/2017/09/ron-paul-... Ron Paul on Trump, Anarchism & the AltRight NaN 0
4007 https://www.reuters.com/article/us-china-pharm... China to accept overseas trial data in bid to ... SHANGHAI (Reuters) - China said it plans to ac... 1
4008 http://beforeitsnews.com/u-s-politics/2017/10/... Vice President Mike Pence Leaves NFL Game Beca... Vice President Mike Pence Leaves NFL Game Beca... 0

4009 rows × 4 columns

In [30]:
d5["Article"] = d5["Headline"] + d5["Body"]
d5["label"] = d5["Label"]
d5.sample(frac = 1) #Shuffle 100%
d5 = d5.loc[:,['Article','label']]
d5 = d5.dropna()
In [31]:
d5['Article']=d5['Article'].apply(wordpre)
In [ ]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(d5[d5.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0x104a5100>
In [ ]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(d5[d5.label== 00].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0x6720a640>
In [32]:
#combining all the datset into one
frames = [d1, d2, df3, df4,d5]
data = pd.concat(frames)
In [33]:
data.shape
Out[33]:
(76480, 2)
In [34]:
data.head()
Out[34]:
Article label
0 you can smell hillary s feardaniel greenfield ... 0
1 watch the exact moment paul ryan committed pol... 0
2 kerry to go to paris in gesture of sympathyu s... 1
3 bernie supporters on twitter erupt in anger ag... 0
4 the battle of new york why this primary matte... 1

Hypothesis Testing

In [36]:
data.label.value_counts().plot(kind='bar')
plt.title('label')
plt.grid()
plt.show()

print(data.label.value_counts())
0    39584
1    36896
Name: label, dtype: int64
In [40]:
!pip install statsmodels
Requirement already satisfied: statsmodels in /usr/local/lib/python3.7/dist-packages (0.10.2)
Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from statsmodels) (1.19.5)
Requirement already satisfied: patsy>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from statsmodels) (0.5.2)
Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.7/dist-packages (from statsmodels) (1.1.5)
Requirement already satisfied: scipy>=0.18 in /usr/local/lib/python3.7/dist-packages (from statsmodels) (1.4.1)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19->statsmodels) (2018.9)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19->statsmodels) (2.8.2)
Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from patsy>=0.4.0->statsmodels) (1.15.0)
In [ ]:
from textblob import TextBlob, Word, Blobber

data['polarity'] = data['Article'].map(lambda text: TextBlob(text).sentiment.polarity)
data['review_len'] = data['Article'].astype(str).apply(len)
data['word_count'] = data['Article'].apply(lambda x: len(str(x).split()))

#Plotting the distribution of the extracted feature
plt.figure(figsize = (20, 5))
plt.style.use('seaborn-white')
plt.subplot(131)
sns.distplot(data['polarity'])
fig = plt.gcf()
plt.subplot(132)
sns.distplot(data['review_len'])
fig = plt.gcf()
plt.subplot(133)
sns.distplot(data['word_count'])
fig = plt.gcf()
/usr/local/lib/python3.6/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
/usr/local/lib/python3.6/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
/usr/local/lib/python3.6/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
In [ ]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(data[data.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0x7fa345de1588>
In [ ]:
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(data[data.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
Out[ ]:
<matplotlib.image.AxesImage at 0x7fa345751470>
In [ ]:
x_train,x_test,y_train,y_test = train_test_split(data['Article'], data['label'], test_size=0.2, random_state=2021)
In [ ]:
x_train.shape
Out[ ]:
(61184,)
In [ ]:
x_test.shape
Out[ ]:
(15296,)
In [ ]:
y_train=y_train.astype('int')
y_test=y_test.astype('int')

Modeling

LogisticRegression

In [ ]:
#LogisticRegression
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())])

Logisticmodel = pipe.fit(x_train, y_train)
prediction = Logisticmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
Logisticmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 86.58%
In [ ]:
print(confusion_matrix(y_test, prediction))
print("\nCLassification Report of Logistic Regression Classifier:\n")
print(classification_report(y_test, prediction))
[[7102  807]
 [1246 6141]]

CLassification Report of Logistic Regression Classifier:

              precision    recall  f1-score   support

           0       0.85      0.90      0.87      7909
           1       0.88      0.83      0.86      7387

    accuracy                           0.87     15296
   macro avg       0.87      0.86      0.87     15296
weighted avg       0.87      0.87      0.87     15296

DecisionTreeClassifier

In [ ]:
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', DecisionTreeClassifier(criterion= 'entropy',
                                           max_depth = 10, 
                                           splitter='best', 
                                           random_state=2020))])
DecisionTreemodel = pipe.fit(x_train, y_train)
prediction = DecisionTreemodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
DecisionTreemodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 81.18%
In [ ]:
print(confusion_matrix(y_test, prediction))
print("\nCLassification Report of DecisionTreeClassifier:\n")
print(classification_report(y_test, prediction))
[[7338  571]
 [2308 5079]]

CLassification Report of DecisionTreeClassifier:

              precision    recall  f1-score   support

           0       0.76      0.93      0.84      7909
           1       0.90      0.69      0.78      7387

    accuracy                           0.81     15296
   macro avg       0.83      0.81      0.81     15296
weighted avg       0.83      0.81      0.81     15296

XGBoostClassifier

In [ ]:
pip install xgboost
Collecting xgboost
  Downloading xgboost-1.5.1-py3-none-manylinux2014_x86_64.whl (173.5 MB)
     |████████████████████████████████| 173.5 MB 19 kB/s s eta 0:00:01    |███████████▏                    | 60.4 MB 16.1 MB/s eta 0:00:08     |█████████████████████████▏      | 136.7 MB 18.0 MB/s eta 0:00:03
Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from xgboost) (1.5.4)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from xgboost) (1.19.5)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.1
WARNING: You are using pip version 20.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
In [ ]:
from xgboost import XGBClassifier
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', XGBClassifier(loss = 'deviance',
                                                   learning_rate = 0.01,
                                                   n_estimators = 10,
                                                   max_depth = 5,
                                                   random_state=2021))])

xgboostmodel = pipe.fit(x_train, y_train)
prediction = xgboostmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
xgboostmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
/usr/local/lib/python3.6/dist-packages/xgboost/sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
[03:00:31] WARNING: ../src/learner.cc:576: 
Parameters: { "loss" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[03:00:33] WARNING: ../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
accuracy: 79.94%
In [ ]:
print(confusion_matrix(y_test, prediction))
print("\nCLassification Report of XGBoostClassifier:\n")
print(classification_report(y_test, prediction))
[[7799  110]
 [2959 4428]]

CLassification Report of XGBoostClassifier:

              precision    recall  f1-score   support

           0       0.72      0.99      0.84      7909
           1       0.98      0.60      0.74      7387

    accuracy                           0.80     15296
   macro avg       0.85      0.79      0.79     15296
weighted avg       0.85      0.80      0.79     15296

Multinomial Naive Bayes Classifier

In [ ]:
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', MultinomialNB())])

MNBCmodel = pipe.fit(x_train, y_train)
prediction = MNBCmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
Multinomial_Naive_Bayes_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 78.47%
In [ ]:
print(confusion_matrix(y_test, prediction))
print("\nCLassification Report of Multinomial Naive Bayes Classifier:\n")
print(classification_report(y_test, prediction))
[[7038  871]
 [2422 4965]]

CLassification Report of Multinomial Naive Bayes Classifier:

              precision    recall  f1-score   support

           0       0.74      0.89      0.81      7909
           1       0.85      0.67      0.75      7387

    accuracy                           0.78     15296
   macro avg       0.80      0.78      0.78     15296
weighted avg       0.80      0.78      0.78     15296

LSTM

In [ ]:
from nltk.corpus import stopwords
from collections import Counter

import warnings
warnings.filterwarnings('ignore')
In [ ]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
In [ ]:
stop = stopwords.words('english')
data['Article'] = data['Article'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
data.head()
Out[ ]:
Article label
0 smell hillary feardaniel greenfield shillman j... 0
1 watch exact moment paul ryan committed politic... 0
2 kerry go paris gesture sympathyu secretary sta... 1
3 bernie supporters twitter erupt anger dnc trie... 0
4 battle new york primary mattersit primary day ... 1
In [ ]:
news_features=data.copy()
news_features=news_features[['Article']].reset_index(drop=True)
news_features.head()
Out[ ]:
Article
0 smell hillary feardaniel greenfield shillman j...
1 watch exact moment paul ryan committed politic...
2 kerry go paris gesture sympathyu secretary sta...
3 bernie supporters twitter erupt anger dnc trie...
4 battle new york primary mattersit primary day ...
In [ ]:
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()

corpus = []
for i in range(0, len(news_features)):
    news = re.sub('[^a-zA-Z]', ' ', news_features['Article'][i])
    news= news.lower()
    news = news.split()
    news = [ps.stem(word) for word in news if not word in stop_words]
    news = ' '.join(news)
    corpus.append(news) 
In [ ]:
corpus[1]
Out[ ]:
'watch exact moment paul ryan commit polit suicid trump ralli video googl pinterest digg linkedin reddit stumbleupon print delici pocket tumblr two fundament truth world paul ryan desper want presid paul ryan never presid today prove particularli stagger exampl polit cowardic paul ryan revers cours announc back trump train aboutfac week ago previous declar would support defend trump tape made public trump brag assault women suddenli ryan appear pro trump ralli boldli declar alreadi sent vote make presid unit state surreal moment figurehead republican parti dose gasolin got stage chilli afternoon wisconsin lit match speakerryan say vote realdonaldtrump republican time come home http co pic twitter com abc news polit abcpolit novemb democrat parti ask better moment film ryan chanc ever becom presid went zero instant wreckag trump leav behind wake cravenli back campaign recov ryan career manag limp way dnc tape lock load use everi ad elect day ring endors man clearli hate person level speak volum spineless ryan postur principl conserv one uncomfort trump unapologet bigotri sexism howev push came shove paul ryan like mani colleagu turn snivel appeas lofti tak convict principl hous card collaps slightest breez especi bizarr close ryan came make unscath month speaker hous refus comment trump strategi seem keep head pretend trump exist hope nobodi rememb happen day away elect screw ugli elect done good expos utter cowardic republican feign moral courag realiti televis star spit hijack parti insult wive got everi last one kneel turn event featur imag via twitter'
In [ ]:
voc_size=10000

onehot_repr=[one_hot(words,voc_size)for words in corpus] 
In [ ]:
sent_length=5000

#Padding the sentences
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)
[[   0    0    0 ... 7592 1194 7958]
 [   0    0    0 ... 5752 9198 4302]
 [   0    0    0 ... 6357 6564 8685]
 ...
 [   0    0    0 ... 3511 7100 4045]
 [   0    0    0 ... 4321 5054 2097]
 [   0    0    0 ... 3221 2617 8575]]
In [ ]:
embedded_docs[1]
Out[ ]:
array([   0,    0,    0, ..., 5752, 9198, 4302], dtype=int32)
In [ ]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000,ngram_range=(2,2))
# TF-IDF feature matrix
X= tfidf_vectorizer.fit_transform(news_features['Article'])
X.shape
Out[ ]:
(76480, 5000)
In [ ]:
y=data['label']
In [ ]:
len(embedded_docs),y.shape
Out[ ]:
(76480, (76480,))
In [ ]:
# Converting the X and y as array
X_final=np.asarray(embedded_docs).astype(np.float32)
y_final=np.asarray(y).astype(np.float32)

#Check shape of X and y final
X_final.shape,y_final.shape
Out[ ]:
((76480, 5000), (76480,))
In [ ]:
#Creating the lstm model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(100))) #Adding 100 lstm neurons in the layer
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))

#Compiling the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 5000, 40)          400000    
_________________________________________________________________
dropout (Dropout)            (None, 5000, 40)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               112800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
=================================================================
Total params: 513,001
Trainable params: 513,001
Non-trainable params: 0
_________________________________________________________________
None
In [ ]:
# Train test split of the X and y final
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)
In [ ]:
# Fitting with 10 epochs and 64 batch size
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)
Epoch 1/10
801/801 [==============================] - 695s 856ms/step - loss: 0.4915 - accuracy: 0.7476 - val_loss: 0.3398 - val_accuracy: 0.8521
Epoch 2/10
801/801 [==============================] - 685s 856ms/step - loss: 0.3356 - accuracy: 0.8520 - val_loss: 0.3722 - val_accuracy: 0.8359
Epoch 3/10
801/801 [==============================] - 686s 856ms/step - loss: 0.3096 - accuracy: 0.8691 - val_loss: 0.3126 - val_accuracy: 0.8555
Epoch 4/10
801/801 [==============================] - 686s 856ms/step - loss: 0.2493 - accuracy: 0.8947 - val_loss: 0.2924 - val_accuracy: 0.8685
Epoch 5/10
801/801 [==============================] - 686s 857ms/step - loss: 0.2434 - accuracy: 0.8959 - val_loss: 0.3075 - val_accuracy: 0.8731
Epoch 6/10
801/801 [==============================] - 686s 857ms/step - loss: 0.2615 - accuracy: 0.8939 - val_loss: 0.2980 - val_accuracy: 0.8769
Epoch 7/10
801/801 [==============================] - 687s 858ms/step - loss: 0.2822 - accuracy: 0.8811 - val_loss: 0.3147 - val_accuracy: 0.8586
Epoch 8/10
801/801 [==============================] - 686s 856ms/step - loss: 0.2192 - accuracy: 0.9061 - val_loss: 0.2735 - val_accuracy: 0.8725
Epoch 9/10
801/801 [==============================] - 687s 857ms/step - loss: 0.1711 - accuracy: 0.9278 - val_loss: 0.2710 - val_accuracy: 0.8894
Epoch 10/10
801/801 [==============================] - 687s 857ms/step - loss: 0.1499 - accuracy: 0.9359 - val_loss: 0.2618 - val_accuracy: 0.8897
Out[ ]:
<tensorflow.python.keras.callbacks.History at 0x7fb1968885f8>
In [ ]:
# Predicting from test data
y_pred = model.predict(X_test)
y_pred = np.round(y_pred).astype(int)
LSTM = round(accuracy_score(y_test, y_pred)*100,2)
In [12]:
print(confusion_matrix(y_test, y_pred))
#Calculating Accuracy score
accuracy_score(y_test,y_pred)
print("\nCLassification Report Long-Short Term Memory:\n")
print(classification_report(y_test, y_pred))
[[11804  1412]
 [ 1373 10650]]

CLassification Report of Long-Short Term Memory:

             precision    recall  f1-score   support

        0.0       0.90      0.89      0.89     13216
        1.0       0.88      0.89      0.88     12023

    accuracy                          0.89     25239
   macro avg      0.89      0.89      0.89     25239
weighted avg      0.89      0.89      0.89     25239
In [ ]:
x = [ "Logisticmodel_accuracy",  "xgboostmodel_accuracy" , 
     "DecisionTreemodel_accuracy","Multinomial_Naive_Bayes_accuracy",
     "LSTM"]
y =  [Logisticmodel_accuracy,xgboostmodel_accuracy,
      DecisionTreemodel_accuracy,Multinomial_Naive_Bayes_accuracy,
      LSTM]
plt.barh(x, y)

for index, value in enumerate(y):
    plt.text(value, index, str(value))
In [ ]:
model.save('LSTM_model')
WARNING:absl:Found untraced functions such as lstm_cell_1_layer_call_and_return_conditional_losses, lstm_cell_1_layer_call_fn, lstm_cell_2_layer_call_and_return_conditional_losses, lstm_cell_2_layer_call_fn, lstm_cell_1_layer_call_fn while saving (showing 5 of 10). These functions will not be directly callable after loading.
WARNING:absl:Found untraced functions such as lstm_cell_1_layer_call_and_return_conditional_losses, lstm_cell_1_layer_call_fn, lstm_cell_2_layer_call_and_return_conditional_losses, lstm_cell_2_layer_call_fn, lstm_cell_1_layer_call_fn while saving (showing 5 of 10). These functions will not be directly callable after loading.
INFO:tensorflow:Assets written to: LSTM_model/assets
INFO:tensorflow:Assets written to: LSTM_model/assets

Hypothesis Testing

In [ ]:
from mlxtend.evaluate import paired_ttest_5x2cv
# check if difference between algorithms is real
t, p = paired_ttest_5x2cv(estimator1=Logesticmodel, 
                          estimator2=model, 
                          X=X_train, 
                          y=y_train, 
                          scoring='accuracy',
                          random_seed=1)
# summarize
print(f'The P-value is = {p:.3f}')
print(f'The t-statistics is = {t:.3f}')
# interpret the result
if p <= 0.05:
    print('Since p<0.05, We can reject the null-hypothesis that both models perform equally well on this dataset. We may conclude that the two algorithms are significantly different.')
else:
    print('Since p>0.05, we cannot reject the null hypothesis and may conclude that the performance of the two algorithms is not significantly different.')
The P-value is = 0.470
The t-statistics is = -0.782
Since p>0.05, we cannot reject the null hypothesis and may conclude that the performance of the two algorithms is not significantly different.
In [ ]: